# Direct outputs not eligible for public release

# GLOBAL SETTINGS --------------------------------------------------------------

options(
    scipen = 999,
    digits = 16,
    max.print = .Machine$integer.max,
    show.error.locations = TRUE,
    warn = 1
)

RNGkind("L'Ecuyer-CMRG")
seed <- 818675309L
set.seed(seed) # setting main seed

# PACKAGES ---------------------------------------------------------------------
library(data.table)
library(fixest)

# PACKAGE SETTINGS -------------------------------------------------------------

# data.table
setDTthreads(threads = 1L)
options(datatable.print.class = TRUE, datatable.print.keys = TRUE)
# so that printing the data.table also shows the variable type on top

# fixest
setFixest_nthreads(nthreads = 1L) # for OLS

# BEGIN FILE -------------------------------------------------------------------

# Read in all of the annual files and combine
# - use "_slim" files as they aren't as wide / more memory-efficient
file_list <-
    list.files(
        path = "~/population-annual-data",
        pattern = "lottery\\_data\\_slim",
        full.names = TRUE
    )

lottery_dt_pooled <- list()
ii <- 0L
for (ff in file_list) {
    ii <- ii + 1L
    lottery_dt_pooled[[ii]] <- readRDS(file = ff)
    lottery_dt_pooled[[ii]] <- setDT(lottery_dt_pooled[[ii]])
}
ii <- NULL
ff <- NULL
file_list <- NULL
rm(ii, ff, file_list)
lottery_dt_pooled <- rbindlist(l = lottery_dt_pooled, use.names = TRUE)

setorderv(lottery_dt_pooled, c("tin", "tax_yr"))

# Make event time
lottery_dt_pooled[, event_time := as.integer(tax_yr - win_yr)]

# Now flag the age in win year, and those aged 21-64 in their own win year
lottery_dt_pooled[
    ,
    age_in_win_year := sum(as.integer(tax_yr == win_yr) * age),
    by = .(tin)
]

# Will estimate relative to -2
lottery_dt_pooled <-
    lottery_dt_pooled[
        between(age_in_win_year, 21L, 64L, incbounds = TRUE) &
            between(event_time, -7L, 5L, incbounds = TRUE)
    ]

# Will estimate relative to -2 for the normalization
# so don't make a dummy for that event time
current_cols <- copy(colnames(lottery_dt_pooled))
lottery_dt_pooled[, event_time_pre7 := as.integer(event_time == -7L)]
lottery_dt_pooled[, event_time_pre6 := as.integer(event_time == -6L)]
lottery_dt_pooled[, event_time_pre5 := as.integer(event_time == -5L)]
lottery_dt_pooled[, event_time_pre4 := as.integer(event_time == -4L)]
lottery_dt_pooled[, event_time_pre3 := as.integer(event_time == -3L)]
lottery_dt_pooled[, event_time_pre1 := as.integer(event_time == -1L)]
lottery_dt_pooled[, event_time_0 := as.integer(event_time == 0L)]
lottery_dt_pooled[, event_time_1 := as.integer(event_time == 1L)]
lottery_dt_pooled[, event_time_2 := as.integer(event_time == 2L)]
lottery_dt_pooled[, event_time_3 := as.integer(event_time == 3L)]
lottery_dt_pooled[, event_time_4 := as.integer(event_time == 4L)]
lottery_dt_pooled[, event_time_5 := as.integer(event_time == 5L)]
dummy_cols <- setdiff(colnames(lottery_dt_pooled), current_cols)

fd_pooled_formula <-
    paste0(
        "db_w2_wages",
        "~",
        paste0(dummy_cols, collapse = "+"),
        "|",
        "factor(age) + factor(tax_yr)"
    )

fd_pooled_coefs <-
    coef(feols(as.formula(fd_pooled_formula), data = lottery_dt_pooled))

fd_pooled_coefs <-
    rbindlist(
        list(
            data.table(
                event_time = setdiff(-7L:5L, -2L),
                estimate = fd_pooled_coefs
            ),
            data.table(event_time = -2L, estimate = 0)
        ),
        use.names = TRUE
    )
setorderv(fd_pooled_coefs, "event_time")

fd_pooled_coefs[, estimate_type := "coefficient"]

# Now we build the levels up from the above contrasts
# - just to add back the population mean so the intercept is correct
fd_pooled_levels <- copy(fd_pooled_coefs)

# Add back the population mean
fd_pooled_levels[, estimate := estimate + mean(lottery_dt_pooled$db_w2_wages)]

# Pin illustrative values for figure
# - pre_line_val: value in omitted year, -2
# - post_line_val: mean in the first five years post win
pre_line_val <-
    round(
        fd_pooled_levels[event_time == -2L]$estimate,
        digits = 2L
    )
post_line_val <-
    round(
        mean(fd_pooled_levels[between(event_time, 1L, 5L)]$estimate),
        digits = 2L
    )

fd_pooled_levels[event_time <= 0L, pre_line := pre_line_val]
fd_pooled_levels[event_time >= 0L, post_line := post_line_val]

fd_pooled_levels[, estimate_type := "level"]

# Combine into result set
fd_pooled_coefs[, c("pre_line", "post_line") := NA]
fd_pooled_results <-
    rbindlist(list(fd_pooled_levels, fd_pooled_coefs), use.names = TRUE)

saveRDS(fd_pooled_results, "~/estimation-output/pooled-fd-results.rds")

# Housekeeping
lottery_dt_pooled <- NULL
current_cols <- NULL
dummy_cols <- NULL
fd_pooled_formula <- NULL
fd_pooled_coefs <- NULL
fd_pooled_levels <- NULL
pre_line_val <- NULL
post_line_val <- NULL
fd_pooled_results <- NULL
rm(
    lottery_dt_pooled,
    current_cols,
    dummy_cols,
    fd_pooled_formula,
    fd_pooled_coefs,
    fd_pooled_levels,
    pre_line_val,
    post_line_val,
    fd_pooled_results
)